

/*------------------------------------------------------------------------------
	1. Housekeeping
------------------------------------------------------------------------------*/
	*Run housekeeping code
		qui do ".../replication_package/housekeeping.do"

	*Write log
		log using "$log_loc/figure_5_panel_b.log", text replace

/*------------------------------------------------------------------------------
	2. Clean Crosswalk
------------------------------------------------------------------------------*/

	*Loop through years: 
		local year = 2018
		*Import zip_code-tract crosswalk
			import excel using "${data}/Zip-Census Tract/ZIP_TRACT_12`year'.xlsx", clear first

				*Make variables lower case
					ren *, lower
	
				*Keep variables needed for the crosswalk
					keep tract tot_ratio zip
	
				*Rename zip code
					ren zip zip_code

				*Drop tract-zip combinations where tract does not correspond to any zip code
					drop if tot_ratio == 0
	
				*Check id
					isid tract zip_code
	
			*Save dataset 
			tempfile zip_cross_`year'
			save `zip_cross_`year'', replace

/*------------------------------------------------------------------------------
	3. Estimate share of (kids age_numd 0-14)/(kids age_numd 0-17)
------------------------------------------------------------------------------*/

			import excel using "$data/Census Count/nc-est2019-syasexn.xlsx", clear cellra(A6:M23)
				keep A K L M
				ren (A K L M) (age_num census_2017 census_2018 census_2019)

			tempfile census_data
			save `census_data', replace

			import excel using "$data/Census Count/nc-est2022-syasexn.xlsx", clear cellra(A6:E23)
				keep A C D
				ren (A C D ) (age_num census_2020 census_2021)

				merge 1:1 age_num using `census_data'
					assert _merge == 3
					drop _merge

				replace age_num = substr(age_num, 2, .)
					destring age_num, replace
					sort age_num
	
				*Make data long (i.e., by age_num and year instead of just by age_num)
					reshape long census_, i(age_num) j(year)
					ren census_ census_ct
					isid year age_num

				*Keep if year is the one to be analyzed
					keep if year == `year'

				*Estimate the share of children under 14
					qui sum if age_num <= 14,d
					local ct_14_and_under = r(sum)

					qui sum if age_num <= 17, d
					local ct_17_and_under = r(sum)

					local share_14_and_under = `ct_14_and_under'/`ct_17_and_under'
					di "For `year', there are `ct_14_and_under' children 14 and under and `ct_17_and_under' children 17 and under"
					di "This corresponds to a share of `share_14_and_under' children being 14 and under"

/*------------------------------------------------------------------------------
	4. Prepare Kids Per Tract Dataset
------------------------------------------------------------------------------*/		
		*Import Household Age Data
		import delimited using "${data}/B09001/ACSDT5Y`year'.B09001-Data.csv", clear rowrange(2) varn(2)

			*Keep key variables
				keep geography geographicareaname estimatetotal estimatetotalinhouseholds estimatetotalinhouseholdsunder3y estimatetotalinhouseholds3and4ye estimatetotalinhouseholds5years estimatetotalinhouseholds6to8yea estimatetotalinhouseholds9to11ye estimatetotalinhouseholds12to14y estimatetotalinhouseholds15to17y

			*Rename variables
				ren (geography geographicareaname estimatetotal estimatetotalinhouseholds estimatetotalinhouseholdsunder3y estimatetotalinhouseholds3and4ye estimatetotalinhouseholds5years estimatetotalinhouseholds6to8yea estimatetotalinhouseholds9to11ye estimatetotalinhouseholds12to14y estimatetotalinhouseholds15to17y) (tract tract_name total in_hh age_num_less_than_3 age_num_3_to_4 age_num_5 age_num_6_to_8 age_num_9_to_11 age_num_12_to_14 age_num_15_to_17)

			*Clean tract variable (last 11 values in tract variable)
				replace tract = substr(tract, -11, 11)

			*Get ct from 0-14
				gen age_num_0_14_hh = age_num_less_than_3 + age_num_3_to_4 + age_num_5 + age_num_6_to_8 + age_num_9_to_11 + age_num_12_to_14
			
			/*Estimate the proportion of non-household kids that are 14 by multiplying the number of kids (0-17) 
			by the proportion of kids in Census that are 0-14*/ 
				gen in_group_quarters = total - in_hh
				gen age_num_0_14_gq = in_group_quarters * `share_14_and_under'

			*Estimate the total number of kids that are 0-14
				gen age_num_0_14_acs = age_num_0_14_gq + age_num_0_14_hh

			*Keep key variables
				keep tract age_num_0_14_acs 

		tempfile kids
		save `kids', replace

/*------------------------------------------------------------------------------
	5. Save education-by-tract dataset
------------------------------------------------------------------------------*/

		*Import Education Data
		import delimited using "${data}/DPO2/ACSDP5Y`year'.DP02-Data.csv", clear rowrange(2) varn(2)

				
				if `year' == 2018 {
					*Keep Key variables
						keep geography geographicareaname estimateeducationalattainmentpop v125 v127 v135

					*Destring variables. 
					foreach var of varlist estimateeducationalattainmentpop v125 v127 v135 {
						replace `var' = "" if `var' == "null"
						drop if `var' == ""
						destring `var', replace				
					}

					*Generate key variables
						gen some_coll_plus_over_25 = v125 + v127 + v135
					
				} 


			*Rename variables
				ren (geography geographicareaname estimateeducationalattainmentpop) (tract tract_name tot_pop_over_25)
	
			*Clean tract variable (last 11 values in tract variable)
				replace tract = substr(tract, -11, 11)

			*merge with kids dataset
				merge 1:1 tract using `kids'
					keep if _merge == 3
					drop _merge

				merge 1:m tract using `zip_cross_`year''
					keep if _merge == 3
					drop _merge

			/*Tot_ratio = ratio of addresses in tract-zip record to total number of addresses in tract
			To allocate data from tract to zip, multiple number of observations in tract by tot_ratio for each zip associated with tract */
			foreach i of varlist age_num_0_14_acs some_coll_plus_over_25  tot_pop_over_25  {
				replace `i' = `i' * tot_ratio
			}

			collapse (sum) age_num_0_14_acs some_coll_plus_over_25  tot_pop_over_25, by(zip_code)

			*Generate pct race estimates by zip code
				gen pct_some_coll_plus_over_25 = 100 * some_coll_plus_over_25/tot_pop_over_25

	
			tempfile educ_data_`year'
			save `educ_data_`year'', replace

/*------------------------------------------------------------------------------
	4. Clean Main Datasets
------------------------------------------------------------------------------*/

		*Import Data
		use "$data/childrens`year'.dta", clear

			/*Restrict Sample*/
				*Keep observations in the 51 states (per the f1095 variable). Note that most of the dropped state values are missing.
					rename state_insurance state
					fmerge m:1 state using "$data/statelist.dta"
						assert _merge != 2 //Confirm that all 51 states are in dataset
						drop if _merge != 3

				*Keep if age_num is within the range
					keep if age_num >= 0 & age_num <= 16

				*Keep if kid died after end of year or is currently alive
					keep if `year'*10000+1231<death_date|death_date==0

				*Keep if months covered is possible
					keep if num_cov_ins > 0 & num_cov_ins <= 12

			/*Generate variables for figure*/		
				*Generate outcame variable on claiming
					gen onreturn = file_inc != .

				*Rename zip code variable 
					ren (zipcode_ins) (zip_code)

				*Generate a count variable
					gen ct = 1

				*Collapse to the by-zipcode level		
					collapse (sum) ct onreturn, by(zip_code)

				*Adjust for claiming limits
					replace onreturn = onreturn * 1.0017029

				*Generate outcome (i.e., percent of children claimed)
					gen perc_return = onreturn/ct			

				*Adjust outcome to be out of 100
					replace perc_return = perc_return*100
			
				*Merge with main dataset
					merge 1:1 zip_code using `educ_data_`year''
						keep if _merge == 3

/*------------------------------------------------------------------------------
	3. Create Figure 2
------------------------------------------------------------------------------*/

			/*Create figure*/		
				binscatter perc_return pct_some_coll_plus_over_25 [w = age_num_0_14_acs], nq(20) ///
				xtitle("% Households with Some College Education or Higher") ytitle("% Children Claimed") ///
				reportreg

				graph export "${output}/`year' Percent with Some College.png", replace as(png)

	